In this project, I produced a sentiment analysis covering over 500,000 reviews from an Amazon Fine Food Review dataset. I classified all positive and negative customer reviews and then created word clouds, plotly visualizations, and a text classification model to display my analysis further.
For this project, I used the Amazon Fine Food Review dataset found on Kaggle.
# Installs
! pip install plotly
! pip install cufflinks
! pip install seaborn
! pip install wordcloud
! pip install numpy
! pip install nltk
# Imports
import os, types
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
nltk.download('stopwords')
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
%matplotlib inline
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import cufflinks as cf
cf.go_offline()
import plotly.express as px
import re
def plot_cloud(wordcloud, figsize=(40, 30)):
    """Draw a generated word cloud as a large matplotlib image without axes.

    Args:
        wordcloud: Image-like object accepted by ``plt.imshow``; the notebook
            passes the result of ``WordCloud(...).generate(text)``.
        figsize: ``(width, height)`` of the figure in inches. Defaults to the
            40 x 30 size this notebook has always used.
    """
    # Set figure size
    plt.figure(figsize=figsize)
    # Display image
    plt.imshow(wordcloud)
    # No axis details
    plt.axis("off")
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
def __iter__(self):
    """Return the constant 0; ``self`` is accepted but never used."""
    result = 0
    return result
Requirement already satisfied: plotly in /opt/anaconda3/lib/python3.8/site-packages (5.1.0) Requirement already satisfied: six in /opt/anaconda3/lib/python3.8/site-packages (from plotly) (1.15.0) Requirement already satisfied: tenacity>=6.2.0 in /opt/anaconda3/lib/python3.8/site-packages (from plotly) (7.0.0) Requirement already satisfied: cufflinks in /opt/anaconda3/lib/python3.8/site-packages (0.17.3) Requirement already satisfied: plotly>=4.1.1 in /opt/anaconda3/lib/python3.8/site-packages (from cufflinks) (5.1.0) Requirement already satisfied: numpy>=1.9.2 in /opt/anaconda3/lib/python3.8/site-packages (from cufflinks) (1.20.1) Requirement already satisfied: ipywidgets>=7.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from cufflinks) (7.6.3) Requirement already satisfied: pandas>=0.19.2 in /opt/anaconda3/lib/python3.8/site-packages (from cufflinks) (1.2.4) Requirement already satisfied: ipython>=5.3.0 in /opt/anaconda3/lib/python3.8/site-packages (from cufflinks) (7.22.0) Requirement already satisfied: colorlover>=0.2.1 in /opt/anaconda3/lib/python3.8/site-packages (from cufflinks) (0.3.0) Requirement already satisfied: setuptools>=34.4.1 in /opt/anaconda3/lib/python3.8/site-packages (from cufflinks) (52.0.0.post20210125) Requirement already satisfied: six>=1.9.0 in /opt/anaconda3/lib/python3.8/site-packages (from cufflinks) (1.15.0) Requirement already satisfied: decorator in /opt/anaconda3/lib/python3.8/site-packages (from ipython>=5.3.0->cufflinks) (5.0.6) Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from ipython>=5.3.0->cufflinks) (3.0.17) Requirement already satisfied: pexpect>4.3 in /opt/anaconda3/lib/python3.8/site-packages (from ipython>=5.3.0->cufflinks) (4.8.0) Requirement already satisfied: backcall in /opt/anaconda3/lib/python3.8/site-packages (from ipython>=5.3.0->cufflinks) (0.2.0) Requirement already satisfied: pickleshare in 
/opt/anaconda3/lib/python3.8/site-packages (from ipython>=5.3.0->cufflinks) (0.7.5) Requirement already satisfied: jedi>=0.16 in /opt/anaconda3/lib/python3.8/site-packages (from ipython>=5.3.0->cufflinks) (0.17.2) Requirement already satisfied: pygments in /opt/anaconda3/lib/python3.8/site-packages (from ipython>=5.3.0->cufflinks) (2.8.1) Requirement already satisfied: appnope in /opt/anaconda3/lib/python3.8/site-packages (from ipython>=5.3.0->cufflinks) (0.1.2) Requirement already satisfied: traitlets>=4.2 in /opt/anaconda3/lib/python3.8/site-packages (from ipython>=5.3.0->cufflinks) (5.0.5) Requirement already satisfied: jupyterlab-widgets>=1.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from ipywidgets>=7.0.0->cufflinks) (1.0.0) Requirement already satisfied: nbformat>=4.2.0 in /opt/anaconda3/lib/python3.8/site-packages (from ipywidgets>=7.0.0->cufflinks) (5.1.3) Requirement already satisfied: ipykernel>=4.5.1 in /opt/anaconda3/lib/python3.8/site-packages (from ipywidgets>=7.0.0->cufflinks) (5.3.4) Requirement already satisfied: widgetsnbextension~=3.5.0 in /opt/anaconda3/lib/python3.8/site-packages (from ipywidgets>=7.0.0->cufflinks) (3.5.1) Requirement already satisfied: jupyter-client in /opt/anaconda3/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets>=7.0.0->cufflinks) (6.1.12) Requirement already satisfied: tornado>=4.2 in /opt/anaconda3/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets>=7.0.0->cufflinks) (6.1) Requirement already satisfied: parso<0.8.0,>=0.7.0 in /opt/anaconda3/lib/python3.8/site-packages (from jedi>=0.16->ipython>=5.3.0->cufflinks) (0.7.0) Requirement already satisfied: ipython-genutils in /opt/anaconda3/lib/python3.8/site-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (0.2.0) Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /opt/anaconda3/lib/python3.8/site-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (3.2.0) Requirement already satisfied: jupyter-core in 
/opt/anaconda3/lib/python3.8/site-packages (from nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (4.7.1) Requirement already satisfied: attrs>=17.4.0 in /opt/anaconda3/lib/python3.8/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (20.3.0) Requirement already satisfied: pyrsistent>=0.14.0 in /opt/anaconda3/lib/python3.8/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets>=7.0.0->cufflinks) (0.17.3) Requirement already satisfied: python-dateutil>=2.7.3 in /opt/anaconda3/lib/python3.8/site-packages (from pandas>=0.19.2->cufflinks) (2.8.1) Requirement already satisfied: pytz>=2017.3 in /opt/anaconda3/lib/python3.8/site-packages (from pandas>=0.19.2->cufflinks) (2021.1) Requirement already satisfied: ptyprocess>=0.5 in /opt/anaconda3/lib/python3.8/site-packages (from pexpect>4.3->ipython>=5.3.0->cufflinks) (0.7.0) Requirement already satisfied: tenacity>=6.2.0 in /opt/anaconda3/lib/python3.8/site-packages (from plotly>=4.1.1->cufflinks) (7.0.0) Requirement already satisfied: wcwidth in /opt/anaconda3/lib/python3.8/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=5.3.0->cufflinks) (0.2.5) Requirement already satisfied: notebook>=4.4.1 in /opt/anaconda3/lib/python3.8/site-packages (from widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (6.3.0) Requirement already satisfied: argon2-cffi in /opt/anaconda3/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (20.1.0) Requirement already satisfied: nbconvert in /opt/anaconda3/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (6.0.7) Requirement already satisfied: Send2Trash>=1.5.0 in /opt/anaconda3/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (1.5.0) Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.8/site-packages (from 
notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (2.11.3) Requirement already satisfied: terminado>=0.8.3 in /opt/anaconda3/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (0.9.4) Requirement already satisfied: prometheus-client in /opt/anaconda3/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (0.10.1) Requirement already satisfied: pyzmq>=17 in /opt/anaconda3/lib/python3.8/site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (20.0.0) Requirement already satisfied: cffi>=1.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (1.14.5) Requirement already satisfied: pycparser in /opt/anaconda3/lib/python3.8/site-packages (from cffi>=1.0.0->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (2.20) Requirement already satisfied: MarkupSafe>=0.23 in /opt/anaconda3/lib/python3.8/site-packages (from jinja2->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (1.1.1) Requirement already satisfied: testpath in /opt/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (0.4.4) Requirement already satisfied: nbclient<0.6.0,>=0.5.0 in /opt/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (0.5.3) Requirement already satisfied: pandocfilters>=1.4.1 in /opt/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (1.4.3) Requirement already satisfied: mistune<2,>=0.8.1 in /opt/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (0.8.4) Requirement 
already satisfied: entrypoints>=0.2.2 in /opt/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (0.3) Requirement already satisfied: bleach in /opt/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (3.3.0) Requirement already satisfied: jupyterlab-pygments in /opt/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (0.1.2) Requirement already satisfied: defusedxml in /opt/anaconda3/lib/python3.8/site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (0.7.1) Requirement already satisfied: async-generator in /opt/anaconda3/lib/python3.8/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (1.10) Requirement already satisfied: nest-asyncio in /opt/anaconda3/lib/python3.8/site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (1.5.1) Requirement already satisfied: packaging in /opt/anaconda3/lib/python3.8/site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (20.9) Requirement already satisfied: webencodings in /opt/anaconda3/lib/python3.8/site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (0.5.1) Requirement already satisfied: pyparsing>=2.0.2 in /opt/anaconda3/lib/python3.8/site-packages (from packaging->bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.0.0->cufflinks) (2.4.7) Requirement already satisfied: seaborn in /opt/anaconda3/lib/python3.8/site-packages (0.11.1) Requirement already satisfied: scipy>=1.0 in /opt/anaconda3/lib/python3.8/site-packages (from seaborn) (1.6.2) Requirement 
already satisfied: pandas>=0.23 in /opt/anaconda3/lib/python3.8/site-packages (from seaborn) (1.2.4) Requirement already satisfied: matplotlib>=2.2 in /opt/anaconda3/lib/python3.8/site-packages (from seaborn) (3.3.4) Requirement already satisfied: numpy>=1.15 in /opt/anaconda3/lib/python3.8/site-packages (from seaborn) (1.20.1) Requirement already satisfied: pillow>=6.2.0 in /opt/anaconda3/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (8.2.0) Requirement already satisfied: python-dateutil>=2.1 in /opt/anaconda3/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.8.1) Requirement already satisfied: kiwisolver>=1.0.1 in /opt/anaconda3/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (1.3.1) Requirement already satisfied: cycler>=0.10 in /opt/anaconda3/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (0.10.0) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /opt/anaconda3/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (2.4.7) Requirement already satisfied: six in /opt/anaconda3/lib/python3.8/site-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.15.0) Requirement already satisfied: pytz>=2017.3 in /opt/anaconda3/lib/python3.8/site-packages (from pandas>=0.23->seaborn) (2021.1) Requirement already satisfied: wordcloud in /opt/anaconda3/lib/python3.8/site-packages (1.8.1) Requirement already satisfied: matplotlib in /opt/anaconda3/lib/python3.8/site-packages (from wordcloud) (3.3.4) Requirement already satisfied: numpy>=1.6.1 in /opt/anaconda3/lib/python3.8/site-packages (from wordcloud) (1.20.1) Requirement already satisfied: pillow in /opt/anaconda3/lib/python3.8/site-packages (from wordcloud) (8.2.0) Requirement already satisfied: python-dateutil>=2.1 in /opt/anaconda3/lib/python3.8/site-packages (from matplotlib->wordcloud) (2.8.1) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /opt/anaconda3/lib/python3.8/site-packages (from 
matplotlib->wordcloud) (2.4.7) Requirement already satisfied: cycler>=0.10 in /opt/anaconda3/lib/python3.8/site-packages (from matplotlib->wordcloud) (0.10.0) Requirement already satisfied: kiwisolver>=1.0.1 in /opt/anaconda3/lib/python3.8/site-packages (from matplotlib->wordcloud) (1.3.1) Requirement already satisfied: six in /opt/anaconda3/lib/python3.8/site-packages (from cycler>=0.10->matplotlib->wordcloud) (1.15.0) Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.8/site-packages (1.20.1) Requirement already satisfied: nltk in /opt/anaconda3/lib/python3.8/site-packages (3.6.1) Requirement already satisfied: click in /opt/anaconda3/lib/python3.8/site-packages (from nltk) (7.1.2) Requirement already satisfied: regex in /opt/anaconda3/lib/python3.8/site-packages (from nltk) (2021.4.4) Requirement already satisfied: tqdm in /opt/anaconda3/lib/python3.8/site-packages (from nltk) (4.59.0) Requirement already satisfied: joblib in /opt/anaconda3/lib/python3.8/site-packages (from nltk) (1.0.1)
[nltk_data] Downloading package stopwords to [nltk_data] /Users/keithmoses/nltk_data... [nltk_data] Package stopwords is already up-to-date!
# Print a short confirmation message to the notebook output.
print("All imports installed...!")
All imports installed...!
# Read the reviews CSV from the working directory. Both names are bound to
# the same DataFrame object (no copy is made).
amazon = df_data_1 = pd.read_csv('Reviews.csv')
# Preview the first rows
amazon.head()
| Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... |
| 1 | 2 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... |
| 2 | 3 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | "Delight" says it all | This is a confection that has been around a fe... |
| 3 | 4 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | If you are looking for the secret ingredient i... |
| 4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... |
To prepare for this analysis, I visualized the product scores from the dataset in a histogram using the plotly library.
# Product score distribution: histogram over the "Score" column.
# update_layout() hands back the same figure, so the title can be set inline.
fig = px.histogram(amazon, x="Score").update_layout(title_text="Product Score")
fig.show()
From the blue histogram, we can see more positive customer ratings than negative. Therefore, the majority of Amazon’s product reviews are positive.
Next, I created a word cloud to show the most frequently used words in the 'Text' (review) column. Before generating it, I checked for null values and filtered out common stopwords (the wordcloud library's built-in STOPWORDS set, plus the fragments "br" and "href").
# Count the missing (NaN) values in each column.
amazon.isna().sum()
Id 0 ProductId 0 UserId 0 ProfileName 16 HelpfulnessNumerator 0 HelpfulnessDenominator 0 Score 0 Time 0 Summary 27 Text 0 dtype: int64
The output above shows that the 'Text' column has no null values; only 'ProfileName' (16) and 'Summary' (27) contain missing entries.
# Concatenate every review body into one space-separated corpus
text = " ".join(amazon.Text)
# Start from wordcloud's built-in stopword list and additionally ignore the
# fragments "br" and "href" (presumably leftover HTML markup in the reviews)
stopwords = set(STOPWORDS) | {"br", "href"}
wordcloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    stopwords=stopwords,
    background_color="white",
    colormap='Set1',
    collocations=False,
)
# generate() fills the cloud in place and returns the same object
wordcloud.generate(text)
plot_cloud(wordcloud)
Next, I added a sentiment column that labels each review as positive or negative based on the dataset's 'Score' column. Reviews with a score > 3 are labeled positive (1), reviews with a score < 3 are labeled negative (-1), and neutral reviews (score = 3) are dropped. Note that the sentiment column is later used as the target label for the sentiment classification model.
# Drop neutral reviews (Score == 3). The explicit .copy() makes `amazon` an
# independent DataFrame, so adding a column below does not trigger pandas'
# SettingWithCopyWarning about writing to a slice of another frame.
amazon = amazon[amazon["Score"] != 3].copy()
# Positive = 1  (Score > 3)
# Negative = -1 (Score < 3)
amazon["Sentiment"] = amazon["Score"].apply(lambda score: -1 if score < 3 else 1)
amazon.head()
| Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | Sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... | 1 |
| 1 | 2 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... | -1 |
| 2 | 3 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | "Delight" says it all | This is a confection that has been around a fe... | 1 |
| 3 | 4 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | If you are looking for the secret ingredient i... | -1 |
| 4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... | 1 |
# Show the dtype of every column, including the newly added Sentiment column.
amazon.dtypes
Id int64 ProductId object UserId object ProfileName object HelpfulnessNumerator int64 HelpfulnessDenominator int64 Score int64 Time int64 Summary object Text object Sentiment int64 dtype: object
After building the sentiment column, I also created word clouds to display the most frequently used words for positive and negative product reviews, respectively. In addition, I made a product sentiment histogram to show how the reviews are distributed between the two sentiment classes.
# Positive Word Cloud
positive = amazon[amazon["Sentiment"] == 1]
text = " ".join(positive.Text)
# Turn line breaks into spaces. Deleting them outright would fuse the last
# word of one line with the first word of the next and distort word counts.
text = text.replace('\n', " ")
stopwords = set(STOPWORDS)
stopwords.update(["br", "href"])
# Keep the cloud in its own variable rather than attaching it as a
# (misspelled) attribute on the earlier `wordcloud` object, which only worked
# if that object already existed in the session.
wordcloud_positive = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    stopwords=stopwords,
    background_color="black",
    colormap='Set2',
    collocations=False,
).generate(text)
plot_cloud(wordcloud_positive)
# Negative Word Cloud
negative = amazon[amazon["Sentiment"] == -1]
text = " ".join(negative.Text)
# Turn line breaks into spaces. Deleting them outright would fuse the last
# word of one line with the first word of the next and distort word counts.
text = text.replace('\n', " ")
# "good" and "great" are hidden here in addition to "br"/"href"
# (presumably so they do not dominate the negative cloud).
stopwords = set(STOPWORDS)
stopwords.update(["good", "great", "br", "href"])
# Keep the cloud in its own variable rather than attaching it as an attribute
# on the earlier `wordcloud` object, which only worked if that object already
# existed in the session.
wordcloud_negative = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    stopwords=stopwords,
    background_color="black",
    colormap='rainbow',
    collocations=False,
).generate(text)
plot_cloud(wordcloud_negative)
# Human-readable label: -1 -> "Negative", every other value -> "Positive"
is_negative = amazon["Sentiment"] == -1
amazon["Sentiment_Rate"] = np.where(is_negative, "Negative", "Positive")
# Histogram of the two sentiment classes; the update_* calls return the
# figure itself, so they can be chained.
fig = px.histogram(amazon, x="Sentiment_Rate")
fig.update_traces(marker_color='orange', marker_line_width=1.5).update_layout(
    title_text="Product Sentiment"
)
fig.show()
From the orange histogram, we can see that the product sentiment is more positive than negative.
Finally, I built a text classification model and measured its accuracy. I start by pre-processing the review summaries (the 'Summary' column): removing special characters, lowercasing the text, and removing stopwords. Then, I convert the summaries to word counts and test the accuracy of the sentiment model with a Multinomial Naive Bayes classifier from the scikit-learn library.
# Remove every character that is neither a word character nor whitespace from
# the summaries. regex=True is passed explicitly: newer pandas versions treat
# the pattern as a literal string by default, which would silently leave the
# punctuation in place. The raw string keeps `\w`/`\s` from being read as
# (invalid) Python string escapes.
amazon["Summary"] = amazon["Summary"].str.replace(r'[^\w\s]', '', regex=True)
amazon.head()
<ipython-input-12-a813c68aaaa4>:1: FutureWarning: The default value of regex will change from True to False in a future version.
| Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | Sentiment | Sentiment_Rate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... | 1 | Positive |
| 1 | 2 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... | -1 | Negative |
| 2 | 3 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | Delight says it all | This is a confection that has been around a fe... | 1 | Positive |
| 3 | 4 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | If you are looking for the secret ingredient i... | -1 | Negative |
| 4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... | 1 | Positive |
# Keep only the summary text and its sentiment label
model_columns = ["Summary", "Sentiment"]
sentiment_df = amazon[model_columns]
sentiment_df.head()
| Summary | Sentiment | |
|---|---|---|
| 0 | Good Quality Dog Food | 1 |
| 1 | Not as Advertised | -1 |
| 2 | Delight says it all | 1 |
| 3 | Cough Medicine | -1 |
| 4 | Great taffy | 1 |
Data Pre-Processing
# Work on an independent copy so the column assignments below modify `df`
# itself instead of a slice of another frame (avoids SettingWithCopyWarning).
df = sentiment_df.copy()
# Missing summaries become empty strings; a bare astype(str) would turn NaN
# into the literal word "nan", which would then be counted as a token.
df["Summary"] = df["Summary"].fillna("").astype(str)
# Change to lowercasing for all text reviews in 'Summary'; splitting and
# re-joining also collapses runs of whitespace into single spaces.
df["Summary"] = df["Summary"].apply(
    lambda summary: " ".join(word.lower() for word in summary.split())
)
# Spot-check the row labelled 2 (single .loc lookup instead of chained indexing)
df.loc[2, "Summary"]
'delight says it all'
# Build the stopword set explicitly instead of reusing the notebook-level
# `stopwords` variable: that variable holds whatever the last word-cloud cell
# left in it, which includes "good" and "great" and would strip two of the
# strongest positive words from the classifier's input.
stop = set(STOPWORDS) | {"br", "href"}
# Drop every stopword from each (already lowercased) summary
df["Summary"] = df["Summary"].apply(
    lambda summary: " ".join(word for word in summary.split() if word not in stop)
)
# Spot-check the row labelled 2
df.loc[2, "Summary"]
'delight says'
# Bag-of-words features: each run of word characters between word boundaries
# is one token
cv = CountVectorizer(token_pattern=r'\b\w+\b')
text_counts = cv.fit_transform(df["Summary"])
# Hold out 30% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    text_counts,
    df["Sentiment"],
    test_size=0.3,
    random_state=1,
)
# Multinomial Naive Bayes Model
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
# Share of held-out reviews whose predicted label matches the true label
accuracy = metrics.accuracy_score(y_test, predicted)
print("Multinomial Naive Bayes Accuracy:", accuracy)
Multinomial Naive Bayes Accuracy: 0.9048654473992837
As a result, the overall classification rate has an approx. 90.5% accuracy!